library(MCMCpack)
#for K-means clustering on the posterior means of gammas
library(cluster)
#################################################################################
#avg_sil() function will silhouette score for the k-means clustering, when the number of clusetr supplied is k.
#################################################################################
avg_sil <- function(k,gamma) {
  gamma.scaled <-scale(gamma)
  km.res <- kmeans(gamma.scaled, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(gamma.scaled))
  mean(ss[, 3])
}
#################################################################################
#getmode() function will get the mode of unique cluster numbers across iterations
#################################################################################
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}


set.seed(200192)
filestring <- "dp.prior.gamma."
M <- 50 ## number of Monte Carlo replications
## share of item pairs rated by a person among all the unique item pairs
share<-0.03
I.vec <- c(40, 80)       ## number of voters
J.vec <- c(40, 80)  ## number of votes

## If the corr bewteen the estimated par and the truth is less than this threshold,we store the data and the model output
corr.threshold<- 0.5 
cluster.true<-3


# storage vectors
I.storage <- NULL
J.storage <- NULL
m.storage <- NULL
cluster.number.storage<-NULL
kmeans.cluster.number.storage<-NULL
gamma.corr.storage <- NULL
theta.corr.storage <- NULL
gamma.mse.storage <- NULL
theta.mse.storage <- NULL
time.storage <- NULL

## constants across all Monte Carlo runs
burnin <- 500
mcmc   <- 2000
thin   <- 2
MCMCpaircompare2d_seed <- 3840
verbose<-mcmc+burnin+1 #shut the verbose
tune<-0.5 
procrustes<-TRUE
alpha <- 0.01
cluster.max<-20
cluster.mcmc <- 300 # mini MCMC to update the gamma for a cluster
alpha.fixed <-FALSE
a0<-0.01 
b0<-1



for (I in I.vec){
  for (J in J.vec){
    for (m in 1:M){
      start.time <- Sys.time()
      cat(filestring, "  I =", I, "  J =", J, "  m =", m, "\n")
      # generate true thetas
      theta.1.true <- rnorm(J)
      theta.2.true <- rnorm(J)
      # true values for anchoring items
      theta.1.true[1] <- 3
      theta.2.true[1] <- 0
      theta.1.true[2] <- 0
      theta.2.true[2] <- 2
      theta.1.true[3] <- -1
      theta.2.true[3] <- -2.5
      theta.true <- cbind(theta.1.true, theta.2.true)
      # generate true gammas
      gamma.unique <- runif(cluster.true)*pi/2
      # make sure all unique gammas are far away enough from each other. 
      # we don't want to generate gammas that cluster together. If so, the corr between estimated gamma and true gammas will be bad
      while(min(diff(sort(gamma.unique)))<pi/2/(cluster.true+1)){
        gamma.unique <- runif(cluster.true)*pi/2
      }
      gamma.true <- sample(gamma.unique, size=I, replace=TRUE,
                           prob=rep(1/cluster.true, cluster.true))
      
      z.true <- cbind(cos(gamma.true), sin(gamma.true))
      
      ##create item pair comparison sparse matrix format
      npairs.i<-round(J*(J-1)/2*share) ## The number of pairs a rater judges
      rater <- NULL
      o1 <- NULL
      o2 <- NULL
      choice <- NULL
      for (i in 1:I){
        rater.id <- paste("rater", 10000 + i, sep="")
        for (p in 1:npairs.i){
          nums <- sample(1:J, size=2, replace=FALSE)
          o1.num <- nums[1]
          o2.num <- nums[2]
          eta <- t(z.true[i, ]) %*% (theta.true[o1.num, ] - theta.true[o2.num, ])
          prob.o1.chosen <- pnorm(eta)
          u <- runif(1)
          if (u <= prob.o1.chosen){
            chose.num <- o1.num
          }
          else{
            chose.num <- o2.num
          }
          rater <- c(rater, rater.id)
          o1 <- c(o1, paste("o", 10000 + o1.num, sep=""))
          o2 <- c(o2, paste("o", 10000 + o2.num, sep=""))
          choice <- c(choice, paste("o", 10000 + chose.num, sep=""))
        }
      }
      
      mydata <- data.frame(rater, o1, o2, choice)
      
      
      ## generate starting values 
      theta.start <- matrix(rnorm(2*J), nrow = J, ncol = 2)
      gamma.start <- seq(0.03, pi/2-0.03, length.out = cluster.max) 
      
      
      out <- MCMCpack::MCMCpaircompare2dDP(mydata,
                               theta.constraints=list(o10001=list(1,3),
                                                      o10001=list(2,0),
                                                      o10002=list(1,0),
                                                      o10002=list(2,2),
                                                      o10003=list(1, -1),
                                                      o10003=list(2,-2.5)
                               ),
                               verbose=verbose, burnin=burnin, mcmc=mcmc, thin=thin,
                               seed = 5615646, gamma.start=gamma.start,
                               theta.start=theta.start,
                               store.theta=TRUE, store.gamma=TRUE,
                               tune=tune, alpha = alpha,  procrustes=procrustes, cluster.max=cluster.max,
                               cluster.mcmc = cluster.mcmc, alpha.fixed = alpha.fixed, a0=a0 , b0=b0) 
      
      # poterior sample means for gammas and thetas
      gamma <- colMeans(out[, grep("gamma", colnames(out))])
      theta <- colMeans(out[, grep("theta", colnames(out))])
      
      
      # correlation between the estimated parameters and the truth
      gamma.corr<-cor(gamma, gamma.true)
      theta.corr<-cor(theta, as.vector(theta.true))
      
      # mean square errors of the estimated parameters
      gamma.mse<-mean((gamma- gamma.true)^2)
      theta.mse<-mean((theta- as.vector(theta.true))^2)
      
      # use K-means to cluster the posterior sample means of gammas
      # check every possible cluster numbers and pick the best one based Silhouette score
      k.values <- 2:cluster.max
      
      # extract avg silhouette for 2-cluster.max clusters
      avg_sil_values<-rep(NA, length(cluster.max-1))
      for(k in k.values){
        avg_sil_values[k-1] <- avg_sil(k=k,gamma = gamma)
      }
    
      kmeans.cluster.number<-1+which(avg_sil_values==max(avg_sil_values))#the possible cluster numbers always start with 2
      # plot(k.values, avg_sil_values,
      #      type = "b", pch = 19, frame = FALSE, 
      #      xlab = "Number of clusters K",
      #      ylab = "Average Silhouettes")
      
      
      ## store info
      I.storage <- c(I.storage, I)
      J.storage <- c(J.storage, J)
      m.storage <- c(m.storage, m)
      cluster.number.storage<-c(cluster.number.storage, getmode(out[,grep('cluster.number', colnames(out))]) )
      kmeans.cluster.number.storage<-c(kmeans.cluster.number.storage, kmeans.cluster.number)
      
      gamma.corr.storage <- c(gamma.corr.storage, gamma.corr)
      theta.corr.storage <- c(theta.corr.storage, theta.corr)
      gamma.mse.storage <- c(gamma.mse.storage,gamma.mse)
      theta.mse.storage <- c(theta.mse.storage, theta.mse)
      end.time <- Sys.time()
      tot.time <- end.time - start.time
      time.storage <- c(time.storage, tot.time)
      print(tot.time)
      
      
      if(gamma.corr < corr.threshold | theta.corr<corr.threshold){
        filename <- paste(filestring, "dp.true.theta.I", I, "J", J,
                          "m", m, "Rda", sep=".")
        save(theta.true, file=filename)
        filename <- paste(filestring, "dp.true.gamma.I", I, "J", J,
                          "m", m, "Rda", sep=".")
        save(gamma.true, file=filename)
        filename <- paste(filestring, "dp.out.I", I, "J", J,
                          "m", m, "Rda", sep=".")
        save(out, file=filename)
        filename <- paste(filestring, "dp.dat.I", I, "J", J,
                          "m", m, "Rda", sep=".")
        save(mydata, file=filename)     
      }
      
      
      
      
    } ## end M loop
  } ## end J loop
} ## end I loop

print(kmeans.cluster.number.storage)

dp.prior.gamma.out <- data.frame(I=I.storage,
                                      J=J.storage,
                                      m=m.storage,
                                      cluster.number=cluster.number.storage,
                                      kmeans.cluster.number=kmeans.cluster.number.storage,
                                      gamma.corr=gamma.corr.storage,
                                      theta.corr=theta.corr.storage,
                                      gamma.mse=gamma.mse.storage,
                                      theta.mse=theta.mse.storage, 
                                      time=time.storage)


save(dp.prior.gamma.out, file=paste(filestring, "out.Rda", sep=""))









